## This file cleans U.S. Census Bureau's Small Area Health Insurance Estimates, ##
## downloaded from: https://www.census.gov/data/datasets/time-series/demo/sahie/estimates-acs.html ##
## Created by Meredith Dost and last run 8/16/2025 ##

# load in packages
library(tidyr)

# set working directory
#setwd("demographic_data/input_data/)

#### 2009 ####
# read in original data
sahie <- read.csv("sahie_2009_modified_colnames.csv", row.names=NULL)
# create unique county-state fips code
sahie$fips <- sahie$statefips*1000 + sahie$countyfips
# subset to 18-64 year-olds
sahie <- subset(sahie, agecat==1)
# subset to only county-level estimates (removing state-level)
sahie <- subset(sahie, geocat==50)
# subset to all races and all sexes
sahie <- subset(sahie, (racecat==0 & sexcat==0))
# subset to at or below 138% of FPL AND all imcomes
sahie <- subset(sahie, (iprcat==3 | iprcat==0))
# keeping only vars we need
sahie <- sahie[c("fips","NIPR","iprcat")]
sahie$NIPR <- as.numeric(as.character(sahie$NIPR))
# changing format from long to wide
sahie_wide <- pivot_wider(sahie, names_from = "iprcat", values_from = "NIPR")
colnames(sahie_wide)[2:3] <- c("pop_18to64","pop_18to64_lt138inc")
# compute percent age 18-64 w/<138% FPL
sahie_wide$pcteligib <- sahie_wide$pop_18to64_lt138inc/sahie_wide$pop_18to64
# compute indicator of if pcteligib is >= median national value in this year
sahie_wide$higheligib <- ifelse(sahie_wide$pcteligib>=median(sahie_wide$pcteligib, na.rm=T), 1, 0)
# add year (w/lag)
sahie_wide$year <- 2010
# only keep vars of interest
sahie09 <- sahie_wide[,c(1,4,5,6)]
# clean up environment
rm(list=setdiff(ls(), c("sahie09")))

#### 2011 ####
# read in original data
sahie <- read.csv("sahie_2011_modified_colnames.csv", row.names=NULL)
# create unique county-state fips code
sahie$fips <- sahie$statefips*1000 + sahie$countyfips
# subset to 18-64 year-olds
sahie <- subset(sahie, agecat==1)
# subset to only county-level estimates (removing state-level)
sahie <- subset(sahie, geocat==50)
# subset to all races and all sexes
sahie <- subset(sahie, (racecat==0 & sexcat==0))
# subset to at or below 138% of FPL AND all incomes
sahie <- subset(sahie, (iprcat==3 | iprcat==0))
# keeping only vars we need
sahie <- sahie[c("fips","NIPR","iprcat")]
sahie$NIPR <- as.numeric(as.character(sahie$NIPR))
# changing format from long to wide
sahie_wide <- pivot_wider(sahie, names_from = "iprcat", values_from = "NIPR")
colnames(sahie_wide)[2:3] <- c("pop_18to64","pop_18to64_lt138inc")
# compute percent age 18-64 w/<138% FPL
sahie_wide$pcteligib <- sahie_wide$pop_18to64_lt138inc/sahie_wide$pop_18to64
# compute indicator of if pcteligib is >= median national value in this year
sahie_wide$higheligib <- ifelse(sahie_wide$pcteligib>=median(sahie_wide$pcteligib, na.rm=T), 1, 0)
# add year (w/lag)
sahie_wide$year <- 2012
# only keep vars of interest
sahie11 <- sahie_wide[,c(1,4,5,6)]
# clean up environment
rm(list=setdiff(ls(), c("sahie09","sahie11")))

#### 2013 ####
# read in original data
sahie <- read.csv("sahie_2013_modified_colnames.csv", row.names=NULL)
# create unique county-state fips code
sahie$fips <- sahie$statefips*1000 + sahie$countyfips
# subset to 18-64 year-olds
sahie <- subset(sahie, agecat==1)
# subset to only county-level estimates (removing state-level)
sahie <- subset(sahie, geocat==50)
# subset to all races and all sexes
sahie <- subset(sahie, (racecat==0 & sexcat==0))
# subset to at or below 138% of FPL AND all incomes
sahie <- subset(sahie, (iprcat==3 | iprcat==0))
# keeping only vars we need
sahie <- sahie[c("fips","NIPR","iprcat")]
sahie$NIPR <- as.numeric(as.character(sahie$NIPR))
# changing format from long to wide
sahie_wide <- pivot_wider(sahie, names_from = "iprcat", values_from = "NIPR")
colnames(sahie_wide)[2:3] <- c("pop_18to64","pop_18to64_lt138inc")
# compute percent age 18-64 w/<138% FPL
sahie_wide$pcteligib <- sahie_wide$pop_18to64_lt138inc/sahie_wide$pop_18to64
# compute indicator of if pcteligib is >= median national value in this year
sahie_wide$higheligib <- ifelse(sahie_wide$pcteligib>=median(sahie_wide$pcteligib, na.rm=T), 1, 0)
# add year (w/lag)
sahie_wide$year <- 2014
# only keep vars of interest
sahie13 <- sahie_wide[,c(1,4,5,6)]
# clean up environment
rm(list=setdiff(ls(), c("sahie09","sahie11","sahie13")))

#### 2015 ####
# read in original data
sahie <- read.csv("sahie_2015_modified_colnames.csv", row.names=NULL)
# create unique county-state fips code
sahie$fips <- sahie$statefips*1000 + sahie$countyfips
# subset to 18-64 year-olds
sahie <- subset(sahie, agecat==1)
# subset to only county-level estimates (removing state-level)
sahie <- subset(sahie, geocat==50)
# subset to all races and all sexes
sahie <- subset(sahie, (racecat==0 & sexcat==0))
# subset to at or below 138% of FPL AND all incomes
sahie <- subset(sahie, (iprcat==3 | iprcat==0))
# keeping only vars we need
sahie <- sahie[c("fips","NIPR","iprcat")]
sahie$NIPR <- as.numeric(as.character(sahie$NIPR))
# changing format from long to wide
sahie_wide <- pivot_wider(sahie, names_from = "iprcat", values_from = "NIPR")
colnames(sahie_wide)[2:3] <- c("pop_18to64","pop_18to64_lt138inc")
# compute percent age 18-64 w/<138% FPL
sahie_wide$pcteligib <- sahie_wide$pop_18to64_lt138inc/sahie_wide$pop_18to64
# compute indicator of if pcteligib is >= median national value in this year
sahie_wide$higheligib <- ifelse(sahie_wide$pcteligib>=median(sahie_wide$pcteligib, na.rm=T), 1, 0)
# add year (w/lag)
sahie_wide$year <- 2016
# only keep vars of interest
sahie15 <- sahie_wide[,c(1,4,5,6)]
# clean up environment
rm(list=setdiff(ls(), c("sahie09","sahie11","sahie13","sahie15")))

#### 2017 ####
# read in original data
sahie <- read.csv("sahie_2017_modified_colnames.csv", row.names=NULL)
# create unique county-state fips code
sahie$fips <- sahie$statefips*1000 + sahie$countyfips
# subset to 18-64 year-olds
sahie <- subset(sahie, agecat==1)
# subset to only county-level estimates (removing state-level)
sahie <- subset(sahie, geocat==50)
# subset to all races and all sexes
sahie <- subset(sahie, (racecat==0 & sexcat==0))
# subset to at or below 138% of FPL AND all incomes
sahie <- subset(sahie, (iprcat==3 | iprcat==0))
# keeping only vars we need
sahie <- sahie[c("fips","NIPR","iprcat")]
sahie$NIPR <- as.numeric(as.character(sahie$NIPR))
# changing format from long to wide
sahie_wide <- pivot_wider(sahie, names_from = "iprcat", values_from = "NIPR")
colnames(sahie_wide)[2:3] <- c("pop_18to64","pop_18to64_lt138inc")
# compute percent age 18-64 w/<138% FPL
sahie_wide$pcteligib <- sahie_wide$pop_18to64_lt138inc/sahie_wide$pop_18to64
# compute indicator of if pcteligib is >= median national value in this year
sahie_wide$higheligib <- ifelse(sahie_wide$pcteligib>=median(sahie_wide$pcteligib, na.rm=T), 1, 0)
# add year (w/lag)
sahie_wide$year <- 2018
# only keep vars of interest
sahie17 <- sahie_wide[,c(1,4,5,6)]
# clean up environment
rm(list=setdiff(ls(), c("sahie09","sahie11","sahie13","sahie15","sahie17")))

#### 2019 ####
# read in original data
sahie <- read.csv("sahie_2019_modified_colnames.csv", row.names=NULL)
# create unique county-state fips code
sahie$fips <- sahie$statefips*1000 + sahie$countyfips
# subset to 18-64 year-olds
sahie <- subset(sahie, agecat==1)
# subset to only county-level estimates (removing state-level)
sahie <- subset(sahie, geocat==50)
# subset to all races and all sexes
sahie <- subset(sahie, (racecat==0 & sexcat==0))
# subset to at or below 138% of FPL AND all incomes
sahie <- subset(sahie, (iprcat==3 | iprcat==0))
# keeping only vars we need
sahie <- sahie[c("fips","NIPR","iprcat")]
sahie$NIPR <- as.numeric(as.character(sahie$NIPR))
# changing format from long to wide
sahie_wide <- pivot_wider(sahie, names_from = "iprcat", values_from = "NIPR")
colnames(sahie_wide)[2:3] <- c("pop_18to64","pop_18to64_lt138inc")
# compute percent age 18-64 w/<138% FPL
sahie_wide$pcteligib <- sahie_wide$pop_18to64_lt138inc/sahie_wide$pop_18to64
# compute indicator of if pcteligib is >= median national value in this year
sahie_wide$higheligib <- ifelse(sahie_wide$pcteligib>=median(sahie_wide$pcteligib, na.rm=T), 1, 0)
# add year (w/lag)
sahie_wide$year <- 2020
# only keep vars of interest
sahie19 <- sahie_wide[,c(1,4,5,6)]
# clean up environment
rm(list=setdiff(ls(), c("sahie09","sahie11","sahie13","sahie15","sahie17","sahie19")))

### merge ###
sahie <- rbind.data.frame(sahie09,sahie11,sahie13,sahie15,sahie17,sahie19)


#setwd("demographic_data/)
write.table(sahie, "sahie_county_data.csv",row.names=F,sep=",")
